{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#计算test的预测准确率\n",
    "#基于用户的协同过滤算法\n",
    "import os\n",
    "import numpy as np\n",
    "#import matplotlib.pyplot as plt\n",
    "#import seaborn as sbn\n",
    "import pandas as pd\n",
    "#正常显示中文标签和负号\n",
    "#plt.rcParams['font.sans-serif'] = ['SimHei']  #用来正常显示中文标签\n",
    "#plt.rcParams['axes.unicode_minus'] = False  #用来正常显示负号\n",
    "import gc"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "DATA_PATH = os.path.join(os.getcwd(),'data','input')\n",
    "\n",
    "user_similarity = np.load(os.path.join(DATA_PATH,'user_similarity.npy'))\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "TRAIN = os.path.join(DATA_PATH,'train.csv')\n",
    "train = pd.read_csv(TRAIN)\n",
    "train = train[['msno','song_id','target']]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "n_users = 21965\n",
    "n_items = 30113\n",
    "train_data_matrix = np.zeros((n_users,n_items))\n",
    "for line in train.itertuples():\n",
    "    train_data_matrix[line[1]-1,line[2]-1] = line[3]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#下一步是预测，已经创建了相似性矩阵\n",
    "#下面基于相似性举证做预测\n",
    "#基于用户的评分可以根据用户均值评分\n",
    "#基于产品评分缺乏针对性\n",
    "def predict(ratings,similarity,type = 'user'):\n",
    "    if type == 'user':\n",
    "        mean_user_rating = ratings.mean(axis = 1)\n",
    "        #You use np.newaxis so that mean_user_rating has same format as ratings\n",
    "        ratings_diff = (ratings - mean_user_rating[:,np.newaxis])\n",
    "        pred = mean_user_rating[:,np.newaxis] + similarity.dot(ratings_diff)/np.array([np.abs(similarity).sum(axis=1)]).T\n",
    "    elif type == 'item':\n",
    "        pred = ratings.dot(similarity)/np.array([np.abs(similarity).sum(axis=1)])     \n",
    "    return pred"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "test_user_prediction = predict(train_data_matrix,user_similarity,type = 'user')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "#print(type(test_user_prediction))\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "15"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "del user_similarity\n",
    "gc.collect()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1216963\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>msno</th>\n",
       "      <th>song_id</th>\n",
       "      <th>target</th>\n",
       "      <th>user_pre_target</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>6051</td>\n",
       "      <td>23194</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>17230</td>\n",
       "      <td>6575</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>4577</td>\n",
       "      <td>6861</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>21721</td>\n",
       "      <td>29949</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>17563</td>\n",
       "      <td>29373</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    msno  song_id  target  user_pre_target\n",
       "0   6051    23194       1                0\n",
       "1  17230     6575       1                0\n",
       "2   4577     6861       1                0\n",
       "3  21721    29949       0                0\n",
       "4  17563    29373       0                0"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "TEST = os.path.join(DATA_PATH,'test.csv')\n",
    "test = pd.read_csv(TEST)\n",
    "test = test[['msno','song_id','target']]\n",
    "test['user_pre_target'] =0\n",
    "print(len(test))\n",
    "test.head()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "6575"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#test.iat[1,1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#赋值\n",
    "for i in range(0,1216963):\n",
    "    test.iat[i,3] = test_user_prediction[test.iat[i,0],test.iat[i,1]]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>msno</th>\n",
       "      <th>song_id</th>\n",
       "      <th>target</th>\n",
       "      <th>user_pre_target</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>6051</td>\n",
       "      <td>23194</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>17230</td>\n",
       "      <td>6575</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>4577</td>\n",
       "      <td>6861</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>21721</td>\n",
       "      <td>29949</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>17563</td>\n",
       "      <td>29373</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    msno  song_id  target  user_pre_target\n",
       "0   6051    23194       1                0\n",
       "1  17230     6575       1                0\n",
       "2   4577     6861       1                0\n",
       "3  21721    29949       0                0\n",
       "4  17563    29373       0                0"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "test.head()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[[568010      0]\n",
      " [648953      0]]\n"
     ]
    }
   ],
   "source": [
    "#模型评价\n",
    "from sklearn import metrics\n",
    "from scipy import interp\n",
    "cm_test = metrics.confusion_matrix(test['target'],test['user_pre_target'])\n",
    "print(cm_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python [conda env:Anaconda3]",
   "language": "python",
   "name": "conda-env-Anaconda3-py"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.3"
  },
  "toc": {
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {},
   "toc_section_display": true,
   "toc_window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
